1 基于iris数据的示例
1.1 报告描述性统计量
- User提问: 在R中报告iris中各个变量的均值、中位数、标准差、最小值和最大值。
# 加载iris数据集
data(iris)
# 计算均值
mean_values <- round(colMeans(iris[, 1:4]), 3)
# 计算中位数
median_values <- sapply(iris[, 1:4], median)
# 计算标准差
sd_values <- round(sapply(iris[, 1:4], sd), 3)
# 计算最小值
min_values <- sapply(iris[, 1:4], min)
# 计算最大值
max_values <- sapply(iris[, 1:4], max)
# 创建一个数据框包含统计量
statistics <- data.frame(
Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
Mean = mean_values,
Median = median_values,
SD = sd_values,
Min = min_values,
Max = max_values
)
# 打印报告
print(statistics) Variable Mean Median SD Min Max
Sepal.Length Sepal.Length 5.843 5.80 0.828 4.3 7.9
Sepal.Width Sepal.Width 3.057 3.00 0.436 2.0 4.4
Petal.Length Petal.Length 3.758 4.35 1.765 1.0 6.9
Petal.Width Petal.Width 1.199 1.30 0.762 0.1 2.5
- User:
结果中最小值和最大值都只有1位小数,如何修改?
# 加载iris数据集
data(iris)
# 计算均值
mean_values <- format(round(colMeans(iris[, 1:4]), 3), nsmall = 3)
# 计算中位数
median_values <- format(round(sapply(iris[, 1:4], median), 3), nsmall = 3)
# 计算标准差
sd_values <- format(round(sapply(iris[, 1:4], sd), 3), nsmall = 3)
# 计算最小值
min_values <- format(round(sapply(iris[, 1:4], min), 3), nsmall = 3)
# 计算最大值
max_values <- format(round(sapply(iris[, 1:4], max), 3), nsmall = 3)
# 创建一个数据框包含统计量
statistics <- data.frame(
Variable = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
Mean = mean_values,
Median = median_values,
SD = sd_values,
Min = min_values,
Max = max_values
)
# 移除Variable列名
rownames(statistics) <- NULL
# 打印报告
print(statistics) Variable Mean Median SD Min Max
1 Sepal.Length 5.843 5.800 0.828 4.300 7.900
2 Sepal.Width 3.057 3.000 0.436 2.000 4.400
3 Petal.Length 3.758 4.350 1.765 1.000 6.900
4 Petal.Width 1.199 1.300 0.762 0.100 2.500
- User: 如何将数据框导出到word文档?
library(officer)
# 创建一个空白的Word文档
doc <- read_docx()
# 添加数据框
table <- as.data.frame(statistics)
doc <- body_add_table(doc, table)
# 保存Word文档
print(doc, target = "iris_table.docx")描述性统计分析导出到Word文档
- User:
按Species分组,报告各组均值,结果保留3位小数?
# 加载iris数据集
data(iris)
# 按Species分组计算均值,并保留3位小数
group_statistics <- aggregate(. ~ Species, data = iris, FUN = function(x) {
round(mean(x), 3)
})
# 打印分组均值
print(group_statistics) Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
1.2 绘制分组箱线图
- User提问:
按Species分组,绘制各变量的箱线图。
# 加载iris数据集
data(iris)
# 按Species分组绘制箱线图
par(mfrow = c(2, 2)) # 设置图形布局为2行2列
# 绘制Sepal.Length的箱线图
boxplot(Sepal.Length ~ Species, data = iris,
main = "Sepal Length", xlab = "Species", ylab = "Sepal Length")
# 绘制Sepal.Width的箱线图
boxplot(Sepal.Width ~ Species, data = iris, main = "Sepal Width", xlab = "Species", ylab = "Sepal Width")
# 绘制Petal.Length的箱线图
boxplot(Petal.Length ~ Species, data = iris, main = "Petal Length", xlab = "Species", ylab = "Petal Length")
# 绘制Petal.Width的箱线图
boxplot(Petal.Width ~ Species, data = iris, main = "Petal Width", xlab = "Species", ylab = "Petal Width") - User提问:
用ggplot2绘制按Species分组后,各变量的箱线图。
# 加载所需的库
library(ggplot2)
# 加载iris数据集
data(iris)
# 按Species分组绘制箱线图
ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
geom_boxplot() +
labs(title = "Sepal Length", x = "Species", y = "Sepal Length")ggplot(iris, aes(x = Species, y = Sepal.Width, fill = Species)) +
geom_boxplot() +
labs(title = "Sepal Width", x = "Species", y = "Sepal Width")ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
geom_boxplot() +
labs(title = "Petal Length", x = "Species", y = "Petal Length")ggplot(iris, aes(x = Species, y = Petal.Width, fill = Species)) +
geom_boxplot() +
labs(title = "Petal Width", x = "Species", y = "Petal Width") - User提问:让上述4个图形显示在同一窗口。
# 加载所需的库
library(ggplot2)
# 加载iris数据集
data(iris)
# 创建绘图对象,按Species分组绘制箱线图
p <- ggplot(iris, aes(x = Species, fill = Species)) +
labs(x = "Species") +
theme_minimal() +
theme(legend.position = "none")
# 添加Sepal.Length的箱线图
p1 <- p +
geom_boxplot(aes(y = Sepal.Length)) +
labs(title = "Sepal Length")
# 添加Sepal.Width的箱线图
p2 <- p +
geom_boxplot(aes(y = Sepal.Width)) +
labs(title = "Sepal Width")
# 添加Petal.Length的箱线图
p3 <- p +
geom_boxplot(aes(y = Petal.Length)) +
labs(title = "Petal Length")
# 添加Petal.Width的箱线图
p4 <- p +
geom_boxplot(aes(y = Petal.Width)) +
labs(title = "Petal Width")
# 在同一窗口中显示四个箱线图
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2) - User提问: Error in loadNamespace(x) :
there is no package called ‘gridExtra’
1.3 方差分析
- User提问:
将iris按照Species分组,检验三个组别的Sepal.length的均值是否相等?
# 加载所需的库
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggplot2)
# 加载iris数据集
data(iris)
# 进行方差分析
anova_result <- aov(Sepal.Length ~ Species, data = iris)
# 检验均值是否相等
anova_summary <- summary(anova_result)
# 提取p-value
p_value <- anova_summary[[1]]$"Pr(>F)"[1]
# 输出方差分析结果
print(anova_summary) Df Sum Sq Mean Sq F value Pr(>F)
Species 2 63.21 31.606 119.3 <2e-16 ***
Residuals 147 38.96 0.265
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n")cat("p-value:", p_value)p-value: 1.669669e-31
- User提问:
该数据满足ANOVA检验的适用条件吗?
- User提问:
iris数据集每个组别内的Sepal.Length观测值是否近似正态分布?
# 加载所需的库
library(dplyr)
library(ggplot2)
# 加载iris数据集
data(iris)
# 按Species分组,进行正态性检验
normality_test <- iris %>%
group_by(Species) %>%
summarise(p_value = shapiro.test(Sepal.Length)$p.value)
# 输出正态性检验结果
print(normality_test)# A tibble: 3 × 2
Species p_value
<fct> <dbl>
1 setosa 0.460
2 versicolor 0.465
3 virginica 0.258
- User提问: R 中进行方差齐性检验的函数是什么?
- User提问: bartlett.test()函数的结果如何看?
bartlett.test(iris$Sepal.Length,iris$Species)
Bartlett test of homogeneity of variances
data: iris$Sepal.Length and iris$Species
Bartlett's K-squared = 16.006, df = 2, p-value = 0.0003345
- User提问: iris数据集不同组别之间的Sepal.Length观测值方差不等,还能用ANOVA检验吗?
- User提问: Welch’s ANOVA的命令是什么?
# 导入iris数据集
data(iris)
# 执行Welch's ANOVA
result <- oneway.test(Sepal.Length ~ Species, data = iris)
# 打印结果
print(result)
One-way analysis of means (not assuming equal variances)
data: Sepal.Length and Species
F = 138.91, num df = 2.000, denom df = 92.211, p-value < 2.2e-16
1.4 建立判别函数
- User提问:基于iris数据,建立Species的判别函数?
- User提问: could not find function “lda”
# Install and load the MASS package
#install.packages("MASS")
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
# Load the iris dataset
data(iris)
# Perform LDA
lda_model <- lda(Species ~ ., data = iris)
# Print the discriminant function results
print(lda_model)Call:
lda(Species ~ ., data = iris)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa 5.006 3.428 1.462 0.246
versicolor 5.936 2.770 4.260 1.326
virginica 6.588 2.974 5.552 2.026
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length 0.8293776 -0.02410215
Sepal.Width 1.5344731 -2.16452123
Petal.Length -2.2012117 0.93192121
Petal.Width -2.8104603 -2.83918785
Proportion of trace:
LD1 LD2
0.9912 0.0088
- User提问:输出结果如何解读?
-
User提问:如何利用判别函数判定一朵鸢尾花属于哪个组别?
# 用于预测的鸢尾花特征向量
new_iris <- data.frame(Sepal.Length = 5.1, Sepal.Width = 3.5, Petal.Length = 1.4, Petal.Width = 0.2)
# 使用判别函数进行预测
predicted_group <- predict(lda_model, newdata = new_iris)
# 输出预测结果
print(predicted_group$class)[1] setosa
Levels: setosa versicolor virginica
1.5 向ChatGPT提问技巧
如何在R中实现XX计算/检验/方法?请用R自带的数据集举例说明。
用XX包在R中实现XX计算/检验?请用R自带的数据集举例说明。
- 指定包的名称,例如ggplot2
XX计算/检验/方法的适用条件是什么?
在R中如何考察XX方法/检验的适用条件是否满足?
如何解读XX函数的输出结果的含义?
R中的XX分析有哪些可视化工具?
2 如何借助chatGPT开展统计分析和撰写R代码?
2.1 描述数据结构
- User提问: 我有控制组和对照组各10人的血压数据,如何判断两个组别的血压是否有显著差距?
- User提问: 我的两个组别只有10人,能用独立样本t检验吗?
2.2 统计方法的选择:样本容量、数据类型、分布特征
- User提问: 还有别的方法吗?
- User提问: Wilcoxon Signed-Rank Test的R代码是什么?
-
User提问:我有控制组和对照组各10人的跟踪观测服药7天,14天和21天的血压数据,可以开展什么统计分析,来判断药物是否有效?
2.3 请求chatGPT创建模拟数据
- User提问:请在R中生成上述模拟数据,及实现Repeated Measures ANOVA的代码?
# 生成模拟数据
set.seed(123) # 设置随机种子以保证结果的可重复性
# 创建数据框
data <- data.frame(
Subject = rep(1:10, each = 3), # 受试者编号,重复测量所以有10个受试者
Time = rep(c("Day 7", "Day 14", "Day 21"), times = 10), # 时间点
Group = rep(c("Control", "Treatment"), each = 15), # 组别
BloodPressure = rnorm(30, mean = c(120, 130, 125), sd = c(5, 7, 6)) # 血压数据
)
# 查看数据
print(data) Subject Time Group BloodPressure
1 1 Day 7 Control 117.1976
2 1 Day 14 Control 128.3888
3 1 Day 21 Control 134.3522
4 2 Day 7 Control 120.3525
5 2 Day 14 Control 130.9050
6 2 Day 21 Control 135.2904
7 3 Day 7 Control 122.3046
8 3 Day 14 Control 121.1446
9 3 Day 21 Control 120.8789
10 4 Day 7 Control 117.7717
11 4 Day 14 Control 138.5686
12 4 Day 21 Control 127.1589
13 5 Day 7 Control 122.0039
14 5 Day 14 Control 130.7748
15 5 Day 21 Control 121.6650
16 6 Day 7 Treatment 128.9346
17 6 Day 14 Treatment 133.4850
18 6 Day 21 Treatment 113.2003
19 7 Day 7 Treatment 123.5068
20 7 Day 14 Treatment 126.6905
21 7 Day 21 Treatment 118.5931
22 8 Day 7 Treatment 118.9101
23 8 Day 14 Treatment 122.8180
24 8 Day 21 Treatment 120.6267
25 9 Day 7 Treatment 116.8748
26 9 Day 14 Treatment 118.1931
27 9 Day 21 Treatment 130.0267
28 10 Day 7 Treatment 120.7669
29 10 Day 14 Treatment 122.0330
30 10 Day 21 Treatment 132.5229
# 进行重复测量方差分析(Repeated Measures ANOVA)
library(nlme)
Attaching package: 'nlme'
The following object is masked from 'package:dplyr':
collapse
# 设置数据为重复测量设计
data$Subject <- factor(data$Subject)
data$Time <- factor(data$Time)
# 执行重复测量方差分析
model <- lme(BloodPressure ~ Time * Group, data = data, random = ~ 1 | Subject)
anova_result <- anova(model)
# 打印重复测量方差分析结果
print(anova_result) numDF denDF F-value p-value
(Intercept) 1 16 13201.338 <.0001
Time 2 16 3.112 0.0721
Group 1 8 1.635 0.2369
Time:Group 2 16 1.151 0.3412
2.4 修改chatGPT生成的代码
2.5 解读输出结果
- User提问: 解释结果 > # 打印重复测量方差分析结果
print(anova_result)
numDF denDF F-value p-value
(Intercept) 1 16 13201.338 <.0001
Time 2 16 3.112 0.0721
Group 1 8 1.635 0.2369
Time:Group 2 16 1.151 0.3412